This is my data visualization portfolio.
I gathered code I wrote to create some nice plots, put them in an Rmarkdown to create an HTML page where I would have an easy and centralized access to all of those code lines. The goal is to preview the graphs so I can identify the features I want to re-use while creating a new plot, and to be able to copy and paste the chunks I want.
The .Rmd file can run by itself, all the data used to draw the plots are either simulated, arbitrarily defined, or from a dataset.
Below is the list of all the required packages. Mostly graphical tools, along with some statistical analysis ressources.
# Tidy data
library(tidyverse)
library(lubridate)
# Supplementary analysis
library(survival)
library(survminer)
library(TraMineR)
library(rstatix)
library(epiR)
# Data
library(ISLR)
# plotly
library(plotly)
# ggplot2 and addins
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(gridExtra)
library(ggseqplot)
library(ggh4x)
library(ggstatsplot)
# Other graphics tools
library(treemap)
Starting with the basics : histograms, barplots, pie charts…
Basic is good, but make it pretty.
An horizontal and stacked barplot :
# Simulate data for a barplot
dataBarplot <- data.frame(
Episode = rep(c("First Covid-19 episode", "Last Covid-19 episode"), each = 63), # Two Covid episodes
Time = factor(rep(rep(c("Acute phase", "3 months or more", "Still persisting"), each = 21), 2),
levels = c("Acute phase", "3 months or more", "Still persisting")), # Symptoms last for a given time
Symptoms = rep(c("Abdominal pain", "Anxiety", "Chest pain", "Cognitive disfunction", "Cough", # List of possible symptoms
"Depression", "Dizziness", "Fatigue", "Fever", "Gastrointestinal problem",
"Headache", "Joint pain", "Loss of smell or taste", "Menstruation issue", "Neuralgia",
"New allergies", "Shortness of breath", "Sleeping disorder", "Tachycardia", "Tinnitus",
"Troubled vision"), 6),
N = c(0, 0, 0, 0, 2, 5, 9, 285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
3985, 4921, 3529, 0, 0, 0, 0, 0, 0, 3, 38, 51, 49, 82, 116, 119, 287, 142, 448,
190, 336, 158, 81, 502, 0, 1, 4, 6, 12, 12, 6, 315, 243, 192, 232, 212, 930, 394,
394, 902, 781, 351, 460, 92, 1318, 0, 0, 0, 0, 0, 0, 0, 71, 62, 119, 148, 162,
217, 123, 427, 347, 520, 669, 825, 730, 574, 0, 0, 0, 0, 1, 0, 0, 3, 7, 13, 7, 10,
22, 8, 13, 40, 15, 34, 18, 17, 36, 0, 1, 1, 2, 1, 2, 2, 61, 67, 59, 51, 58, 85,
217, 84, 200, 172, 79, 18, 115, 296)) %>%
# Total number of cases per symptoms and episode
mutate(Ntot = c(rep(by(N[Episode == "First Covid-19 episode"], Symptoms[Episode == "First Covid-19 episode"], sum), 3),
rep(by(N[Episode == "Last Covid-19 episode"], Symptoms[Episode == "Last Covid-19 episode"], sum), 3)))
# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) +
# Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
geom_col(
mapping = aes(N, reorder(Symptoms, -N), fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
width = 0.6, # Set the bar width
position = "stack" # Stack bars by the 'fill' variable
) +
# Create facets (separate panels) for each level of the 'Episode' variable
facet_grid(cols = vars(Episode)) +
# Add a vertical line at x = 0 for reference
geom_vline(aes(xintercept = 0)) +
# Customize the x-axis scale
scale_x_continuous(
limits = c(0, 6500), # Set the range for the x-axis
breaks = seq(0, 6000, 1000), # Major tick marks every 1000 units
minor_breaks = seq(500, 6500, 500), # Minor tick marks every 500 units
expand = c(0, 0) # Remove padding on the x-axis
) +
scale_fill_manual(
name = "Symptoms duration",
values = c('#28AFB0', '#F6803C', '#82354F')
) +
# Add text labels to display 'Ntot' values next to each bar
geom_text(
mapping = aes(Ntot + 25, y = Symptoms, label = Ntot), # Position and content of the text
hjust = 0, # Align text to the left of its position
nudge_x = 1 # Slightly nudge text horizontally
) +
# Add titles and subtitles (empty here for now)
labs(title = "", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.x = element_line(colour = "lightgray"), # Minor grid lines for x-axis
legend.position = "bottom"
)
This one is vertical and dodged :
# Simulate data for a barplot
dataBarplot <- data.frame(
Time = factor(rep(c("Acute phase", "3 months or more", "Still persisting"), each = 14),
levels = c("Acute phase", "3 months or more", "Still persisting")), # Symptoms last for a given time
Symptoms = rep(c("Fatigue", "Loss of smell or taste", "Menstruation issue", "Neuralgia", "Fever", "Gastrointestinal problem",
"Headache", "Sleeping disorder", "Tachycardia", "Joint pain",
"New allergies", "Shortness of breath", "Tinnitus",
"Troubled vision"), 3),
N = c(285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
3985, 4921, 3529, 38, 51, 49, 82, 116, 119, 287, 142, 448,
190, 336, 158, 81, 502, 315, 243, 192, 232, 212, 930, 394,
394, 902, 781, 351, 460, 92, 1318))
# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) +
# Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
geom_col(
mapping = aes(reorder(Symptoms, -N), N, fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
width = 0.9, # Set the bar width
position = "dodge" # Stack bars by the 'fill' variable
) +
# Add a vertical line at x = 0 for reference
geom_hline(aes(yintercept = 0)) +
# Customize the x-axis scale
scale_y_continuous(
limits = c(0, 5500), # Set the range for the x-axis
breaks = seq(0, 5000, 1000), # Major tick marks every 1000 units
minor_breaks = seq(500, 5500, 500), # Minor tick marks every 500 units
expand = c(0, 0) # Remove padding on the x-axis
) +
scale_fill_manual(
name = "Symptoms duration",
values = c('#28AFB0', '#F6803C', '#82354F')
) +
# Add text labels to display 'Ntot' values next to each bar
geom_text(
mapping = aes(x = Symptoms, y = N, label = N), # Position and content of the text
nudge_x = rep(c(-0.3, 0, 0.3), each = 14),
nudge_y = 120,
size = 2
) +
# Add titles and subtitles (empty here for now)
labs(title = "", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.y = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.y = element_line(colour = "lightgray"), # Minor grid lines for x-axis
legend.position = "bottom",
axis.text.x = element_text(
size = 10, # Font size
angle = 45, # Rotate text 45 degrees
vjust = 1, # Vertical alignment
hjust = 1 # Horizontal alignment
)
)
I like to display percentages as stacked barplots, to make it obvious that values sum up to a hundred.
This one has and grid according to the status, and I feel like it is easier to read the proportions differences whenever the bars are horizontal :
# Simulate data for a bar plot
dataBarplot <- data.frame(
# Create a "Status" column with repeated categories (12 repetitions for each category)
Status = rep(c("Family member", "Active worker", "Retired worker"), each = 12),
# Create an "Affection" column, categorizing Covid-19 status, with ordered factor levels
Affection = factor(
rep(rep(c("No Covid-19", "Covid-19", "Long Covid-19"), each = 4), 3),
levels = c("No Covid-19", "Covid-19", "Long Covid-19")
),
# Create a "Social_satisfaction" column, representing satisfaction levels, with ordered factor levels
Social_satisfaction = factor(
rep(c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied"), 9),
levels = c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied")
),
# Define a numeric vector "N" representing counts corresponding to the combinations above
N = c(65, 255, 70, 16, 38, 131, 41, 7, 21, 123, 59, 10,
819, 2532, 591, 130, 640, 1723, 357, 38, 235, 1134, 468, 68,
1444, 4212, 677, 128, 528, 1198, 164, 20, 160, 575, 166, 27)
) %>%
# Add new columns using dplyr's mutate function
mutate(
# Compute the total count (Ntot) for each "Affection" group within each "Status" group
Ntot = c(
# For "Family member", calculate totals per "Affection" group
rep(by(N[Status == "Family member"], Affection[Status == "Family member"], sum), each = 4),
# For "Active worker", calculate totals per "Affection" group
rep(by(N[Status == "Active worker"], Affection[Status == "Active worker"], sum), each = 4),
# For "Retired worker", calculate totals per "Affection" group
rep(by(N[Status == "Retired worker"], Affection[Status == "Retired worker"], sum), each = 4)
),
# Calculate percentages (P) for each count (N) relative to the total count (Ntot) per group
P = N / Ntot * 100
)
# Initialize the plot with the dataset (DATA_ex2_2)
ggplot(dataBarplot) +
# Add a stacked bar chart
geom_col(
mapping = aes(P, Affection, fill = Social_satisfaction), # Map 'P' to x, 'COVID_long_3mod' to y, and 'Q_E1' to fill
width = 0.6, # Set the bar width
position = "stack", # Stack bars by the 'fill' variable
color = "white" # Add white border to bars
) +
# Create facets (separate rows) for each level of the Status variable
facet_grid(rows = vars(Status)) +
# Add vertical lines at x = 0 and x = 100 for reference
geom_vline(aes(xintercept = 0)) +
geom_vline(aes(xintercept = 100)) +
# Customize the x-axis scale
scale_x_continuous(
limits = c(0, 100.2), # Set the range for the x-axis
breaks = seq(0, 100, 10), # Major tick marks every 10%
minor_breaks = seq(5, 95, 10), # Minor tick marks every 5%
expand = c(0, 0), # Remove padding on the x-axis
labels = paste0(seq(0, 100, 10), "%") # Append "%" to the tick labels
) +
# Add titles and subtitles (empty here for now)
labs(title = " ", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
# axis.text.x = element_blank(), # Uncomment to hide x-axis labels
# axis.text.y = element_blank(), # Uncomment to hide y-axis labels
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.x = element_line(colour = "lightgray") # Minor grid lines for x-axis
) +
# Customize the fill legend for the bar chart
scale_fill_manual(
"Social interactions", # Title for the legend
values = rev(RColorBrewer::brewer.pal(4, "RdYlGn")), # Reverse color palette
labels = c( # Define labels for legend items
"Very satisfied", # Very satisfactory
"Satisfied", # Somewhat satisfactory
"Unsatisfied", # Somewhat unsatisfactory
"Very unsatisfied" # Very unsatisfactory
)
)
Adding some percentages labels and handling long variables names :
# Create the data frame for the bar plot
dataBarplot <- data.frame(
# "LTI" column with long-term illness categories repeated for each age group
LTI = factor(rep(
c("Leukaemia", "Diabetes (insipidus & mellitus)", "Epilepsy", "Multiple Sclerosis",
"Parkinson & affiliated", "Mental Illness"), 5),
levels = c("Leukaemia", "Diabetes (insipidus & mellitus)", "Epilepsy", "Multiple Sclerosis",
"Parkinson & affiliated", "Mental Illness")
),
# "Age" column with age group categories repeated for each illness type
Age = factor(rep(
c("Under 18", "18 to 24 years old", "25 to 44 years old", "45 to 59 years old", "Over 60"), each = 6),
levels = c("Under 18", "18 to 24 years old", "25 to 44 years old", "45 to 59 years old", "Over 60")
),
# Randomly generated values for the "N" column representing counts
N = sample(100:300, 30)
) %>%
# Calculate the percentage contribution of each count (N) within each illness type (LTI)
mutate(
percent = as.numeric(unlist(by(N, LTI, function(x) {x / sum(x) * 100})))[rep(seq(1, 26, 5), 5) + rep(0:4, each = 6)],
# Label for percentages, shown only if the value exceeds 2%
lab = ifelse(percent > 2, paste0(formatC(percent, 1, format = "f"), "%"), "")
)
# Initialize a vector for positioning text labels within each bar
x_text <- rep(NA, nrow(dataBarplot))
# Calculate the cumulative y-position for placing the text labels
for (i in unique(dataBarplot$LTI)) {
for (j in 1:sum(dataBarplot$LTI == i)) {
ifelse(j == 1,
x_text[which(dataBarplot$LTI == i)[j]] <- dataBarplot$percent[dataBarplot$LTI == i][j] / 2,
x_text[which(dataBarplot$LTI == i)[j]] <- sum(dataBarplot$percent[dataBarplot$LTI == i][1:(j-1)]) +
dataBarplot$percent[dataBarplot$LTI == i][j] / 2)
}
}
dataBarplot$x_text <- 100 - x_text # Adjust text position to align with the plot
# Generate the bar plot
ggplot(data = dataBarplot) +
# Create stacked bars for proportions by age group
geom_col(
mapping = aes(x = LTI, y = percent, fill = Age),
width = 0.6, # Bar width
position = "stack", # Stacked bar position
color = "white" # Bar border color
) +
# Add percentage labels to each segment of the bar
geom_text(
aes(x = LTI, y = x_text, label = lab),
size = 2, # Font size
colour = "white" # Text color
) +
# Add titles and axis labels
labs(
title = "Long term illnesses per age group", # Title
ylab = "Proportion", # Y-axis label
xlab = "" # X-axis label (empty)
) +
# Customize the y-axis scale
scale_y_continuous(
breaks = seq(0, 100, 25), # Tick intervals
labels = c("0%", "25%", "50%", "75%", "100%") # Custom tick labels
) +
# Customize the fill colors for the "Age" categories
scale_fill_manual(
values = c('#1E5471', '#28AFB0', '#F6803C', '#82354F', '#E0D6FF'),
name = "Classe ATC1" # Legend title
) +
# Apply a minimal theme to the plot
theme_minimal() +
# Customize the legend and x-axis text
theme(
legend.position = "right", # Position the legend on the right
axis.text.x = element_text(
size = 10, # Font size
angle = 45, # Rotate text 45 degrees
vjust = 1, # Vertical alignment
hjust = 1 # Horizontal alignment
)
)
A very basic one. But adding a second axis with error-bars representing the incidence rate CI95% :
# Define maximum y-axis value and coefficient for secondary y-axis scaling
ymax <- 200
coeff <- 70 / 200
# Create data frame for bar plot
dataBarplot <- data.frame(
year = 2012:2022, # Years from 2012 to 2022
n_cases = c(120, 87, 78, 96, 117, 153, 139, 164, 181, 188, 163), # Number of cases per year
pop = rep(300000, 11) # Constant population size of 300,000
) %>%
mutate(
# Calculate incidence rate (IR) per 100,000 population
IR = n_cases / pop * 100000,
# Calculate confidence intervals for the incidence rate
IR_lower = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 2] * 100000,
IR_upper = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 3] * 100000
)
# Create the plot
ggplot(dataBarplot, aes(x = year, y = n_cases, width = 0.95)) +
# Add bars for number of cases
geom_col(
mapping = aes(fill = "Incidence"),
fill = "#AAC0AF" # Bar color
) +
# Add a line for incidence rate (IR)
geom_line(
aes(
x = year,
y = IR,
colour = "Incidence rate" # Legend label for line
),
linetype = 1, # Solid line
size = 1, # Line thickness
colour = "black" # Line color
) +
# Add error bars for confidence intervals of the incidence rate
geom_errorbar(
aes(
x = year,
ymin = IR_lower, # Lower bound of the confidence interval
ymax = IR_upper, # Upper bound of the confidence interval
width = 0.2 # Error bar width
)
) +
# Add text labels for the number of cases above each bar
geom_text(
aes(x = year, y = n_cases, label = n_cases),
nudge_y = 7, # Offset text above the bars
size = 6 # Font size
) +
# Customize the y-axis
scale_y_continuous(
expand = c(0, 0), # Remove padding around the axis
limits = c(0, ymax), # Set y-axis limits
breaks = seq(0, ymax, 25), # Major tick marks
name = "Incidence", # Left y-axis label
sec.axis = sec_axis(
trans = ~ . * coeff, # Transformation for secondary axis
name = "Incidence rate", # Right y-axis label
breaks = seq(0, ymax * coeff, 10) # Tick marks for secondary axis
)
) +
# Customize the x-axis
scale_x_continuous(
breaks = 2012:2022, # Tick marks for each year
labels = 2012:2022, # Labels for each year
name = "" # Empty label for the x-axis
) +
# Apply a classic theme
theme_classic() +
# Customize grid lines, axis text, and axis colors
theme(
panel.grid.major.y = element_line(colour = "grey"), # Major grid lines
panel.grid.minor.y = element_line(colour = "lightgray"), # Minor grid lines
axis.text.x = element_text(size = 11), # Font size for x-axis labels
axis.line.y.left = element_line(color = "#AAC0AF"), # Color for left y-axis line
axis.title.y.left = element_text(color = "#AAC0AF"), # Color for left y-axis title
axis.text.y.left = element_text(color = "#AAC0AF") # Color for left y-axis labels
) +
# Add titles and remove legend title
labs(
title = "", # Plot title
fill = "", # Legend title (empty)
x = " ", # x-axis title (empty)
y = " " # y-axis title (empty)
)
I don’t really enjoy boxplots as I feel they get quite boring. But you can add some twists to make it interesting !
Basic setup :
# Create a data frame for boxplot visualization
dataBoxplot <- data.frame(
med_id = rep(1:8, c(8, 3, 3, 5, 7, 6, 4, 11)), # Assign medical operator IDs, repeated based on surgery counts
op_id = c(1:8, 1:3, 1:3, 1:5, 1:7, 1:6, 1:4, 1:11), # Assign operation IDs for each surgery
setup = pmax(10 - rpois(47, lambda = 4), 1) # Simulate 'setup' scores, capped at 10 and minimum of 1
)
# Create a data frame for the line plot (average setup values per operation ID)
dataLine <- data.frame(
op_id = 1:max(dataBoxplot$op_id), # Sequence of operation IDs
setup = as.numeric(by(dataBoxplot$setup, dataBoxplot$op_id, mean)) # Compute mean 'setup' scores by op_id
)
# Generate the plot using ggplot2
ggplot(dataBoxplot) +
# Add a boxplot layer for setup scores grouped by operation ID
geom_boxplot(
aes(group = op_id, x = op_id, y = setup) # Map operation ID and setup scores to boxplot
) +
# Add a red line connecting the mean setup values for each operation ID
geom_line(
data = dataLine, # Use the data frame with mean setup values
aes(op_id, setup), # Map operation ID to x and mean setup to y
color = "red" # Set line color to red
) +
# Add red points for the mean setup values
geom_point(
data = dataLine, # Use the data frame with mean setup values
aes(op_id, setup), # Map operation ID to x and mean setup to y
color = "red", # Set point color to red
shape = 16 # Use solid circle for points
) +
annotate(
"text",
x = 1:max(dataBoxplot$op_id), y = 10,
label = paste0("(n=", table(dataBoxplot$op_id),")")
) +
# Add a subtitle with the p-value from a linear model (setup ~ op_id)
labs(subtitle = paste0("p=", round(summary(lm(setup ~ op_id, dataBoxplot))$coefficient[2, 4], 4))) +
# Label the y-axis
ylab("Surgery setup grade (out of 10)") +
# Label the x-axis
xlab("Number of surgeries") +
# Customize the y-axis scale
scale_y_continuous(
limits = c(0, 10.5), # Set y-axis limits between 0 and 10
breaks = 0:10, # Add breaks at each integer value
expand = c(0, 0) # Remove padding on the axis
) +
# Customize the x-axis scale
scale_x_discrete(
limits = 1:max(dataBoxplot$op_id), # Limit x-axis to the range of operation IDs
breaks = 1:max(dataBoxplot$op_id) # Add breaks at each operation ID
) +
# Apply a minimal theme for clean visualization
theme_minimal()
Jitter boxplots fixes one of the classic boxplots downside by adding the points to give a better idea of the data distribution :
dataBoxplot <- data.frame(
y = rnorm(200),
Group = sample(LETTERS[1:3], size = 200, replace = TRUE)
)
# Box plot by group with jitter
ggplot(dataBoxplot, aes(x = Group, y = y, colour = Group)) +
geom_boxplot(outlier.shape = NA) + # Box plot without showing outliers
geom_jitter(width = 0.2) + # Add jittered points for individual observations
theme_minimal() + # Use a minimal theme for clean visualization
xlab("Group") + # X-axis label
ylab("Some randomly generated values") # Y-axis label
Adding some two-by-two testing :
# Simulate data for a boxplot
dataBoxplot <- data.frame(
id = rep(1:47, 5), # 47 subjects, repeated 5 times for each group
times = rep(paste0("Month ", c(0, 3, 6, 9, 12)), each = 47), # 5 time points
val = c( # Simulate values for each time point with different means and SDs
rnorm(n = 47, mean = 2, sd = 1), # Group 1 (mean=2, sd=1)
rnorm(n = 47, mean = 5, sd = 1), # Group 2 (mean=5, sd=1)
rnorm(n = 47, mean = 5, sd = 2), # Group 3 (mean=5, sd=2)
rnorm(n = 47, mean = 8, sd = 2), # Group 4 (mean=8, sd=2)
rnorm(n = 47, mean = 8, sd = 5) # Group 5 (mean=8, sd=5)
)
)
dataBoxplot$val <- pmax(0, dataBoxplot$val)
# Perform ANOVA to test differences between the groups
res.aov <- anova_test(
data = dataBoxplot,
dv = val, # Dependent variable: val (size)
wid = id, # Within-subjects factor: id
between = times # Between-subjects factor: times
)
# Pairwise t-tests to compare each group with group 1, adjusting for multiple comparisons using Bonferroni correction
pwc <- dataBoxplot %>%
pairwise_t_test(
val ~ times, # Dependent variable 'val' across levels of 'times'
paired = TRUE, # Paired samples
ref.group = "Month 0", # Use "n°1" as the reference group
p.adjust.method = "bonferroni" # Apply Bonferroni correction to p-values
) %>%
add_xy_position(x = "times") # Add xy-position for displaying p-values
# Create the boxplot with individual data points and p-values
ggboxplot(
dataBoxplot, # Data for the boxplot
x = "times", # Group by 'times'
y = "val", # Plot 'val' (size) on the y-axis
add = "point", # Add individual data points to the boxplot
ylab = "Size", # Label for the y-axis
xlab = "" # No label for the x-axis
) +
scale_y_continuous(
limits = c(0, 25),
expand = c(0, 0)
) +
stat_pvalue_manual(pwc) + # Add p-values from pairwise t-tests
labs(
subtitle = get_test_label(res.aov, detailed = FALSE), # Subtitle for ANOVA result
caption = "2 by 2 testing: Student's t-test, p-value adjustment: Bonferroni" # Caption explaining test details
)
Switching to violins plots to add some info and some nice colors :
ggstatsplot::ggbetweenstats(ISLR::Wage, education, wage, "np") +
ylab("Workers raw wage") +
xlab("Education level")
This section is dedicated to regression models outputs and other associated graph.
Starting with the survival analysis, the most classic graph is the descending survival curve :
# Simulate survival data
n <- 1000 # Number of individuals
dataSurvCurve <- data.frame(
id = 1:n, # Unique identifier for each individual
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time (1 to 365 days)
event = factor(c(
sample(0:1, n/2, prob = c(0.4, 0.6), replace = TRUE), # Events for the unexposed group
sample(0:1, n/2, prob = c(0.7, 0.3), replace = TRUE) # Events for the exposed group
)), # Event status as a factor: 0 = censored, 1 = event
exposure = rep(c("Unexposed", "Exposed"), each = n/2) # Exposure groups
)
# Adjust time for censored observations (event == 0)
dataSurvCurve$time[dataSurvCurve$event == 0] <- 365
# Fit the survival model with event stratified by exposure
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = dataSurvCurve)
# Define the upper limit of the y-axis in percentage
ymax <- 30
# Plot survival curves using autoplot from the ggplot2 framework
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) + # Start with a blank base plot
geom_step(aes(linetype = strata, color = event), size = 0.8) + # Add step lines by strata and event
scale_linetype_manual(
name = "Group", # Legend title for group
values = c('dotted', 'solid'), # Line types for the groups
labels = c('Non-exposed', 'Exposed') # Labels for the groups
) +
scale_color_manual(
name = "Event", # Legend title for events
values = c("white", "#1C4073"), # Colors for censored and events
labels = c(' ', "Hospitalised") # Labels for the events
) +
labs(
x = "Follow up period (days)", # X-axis label (French: Follow-up period in days)
y = "Proportion of population", # Y-axis label
title = "" # Empty title
) +
geom_vline(aes(xintercept = 0), size = 1) + # Add vertical line at x = 0
geom_hline(aes(yintercept = 1 - ymax / 100), size = 1) + # Add horizontal line for the ymax threshold
scale_y_reverse(
limits = 1 - c(ymax / 100, 1), # Reverse y-axis to represent decreasing survival
breaks = 1 - seq(ymax / 100, 1, 0.1), # Break points for y-axis
labels = paste0(seq(ymax, 100, 10), "%"), # Labels as percentages
expand = c(0, 0) # No expansion around the limits
) +
scale_x_continuous(
breaks = seq(0, 360, 60), # X-axis breaks every 60 days
expand = c(0, 0) # No expansion around the limits
) +
theme_minimal() + # Use a minimal theme
theme(
plot.title = element_text(hjust = 0.5), # Center align the title
axis.title.x = element_text(face = "bold", colour = "black", size = 12), # Style x-axis title
axis.title.y = element_text(face = "bold", colour = "black", size = 12), # Style y-axis title
legend.title = element_text(face = "bold", size = 10), # Style legend title
panel.grid.major.y = element_line(colour = "lightgray", size = 0.3), # Grid for y-axis
panel.grid.minor.x = element_blank(), # Remove minor grid for x-axis
legend.position = c(0.15, 0.30), # Position legend inside plot
legend.background = element_rect(fill = "transparent", color = "white") # Transparent background for legend
)
Second version of almost the same curve, adding some confidence interval using ggsurvplot() :
# Simulate survival data for analysis
n <- 1000 # Number of individuals
dataSurvCurve <- data.frame(
id = 1:n, # Unique identifier for each individual
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time between 1 and 365 days
event = c(
sample(0:1, n/2, prob = c(0.4, 0.6), replace = TRUE), # First group: 40% censored, 60% events
sample(0:1, n/2, prob = c(0.7, 0.3), replace = TRUE) # Second group: 70% censored, 30% events
),
# Event status: 0 = censored, 1 = event occurred
exposure = rep(c("Unexposed", "Exposed"), each = n/2) # Exposure status: "Unexposed" or "Exposed"
)
# Fit survival curves stratified by exposure
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = dataSurvCurve)
# Plot the survival curves using ggsurvplot
ggsurvplot(
fit, # Survival fit object
data = dataSurvCurve, # Data source
legend.title = "Group", # Title for the legend
legend.labs = c("Exposed", "Unexposed"), # Labels for legend groups
conf.int = TRUE, # Display confidence intervals
censor = FALSE, # Do not display censoring marks
xlab = "Follow up time (days)", # Label for x-axis
ylab = "Non-hospitalisation probability", # Label for y-axis
break.time.by = 60, # Breaks on the x-axis every 60 days
surv.scale = "percent", # Scale survival probability as percentages
ylim = c(0, 1), # Set y-axis limits (0% to 100%)
axes.offset = FALSE, # Align axes at the origin
legend = c(0.12, 0.15) # Position the legend within the plot
)
Let’s add one more event and have an some components (I would use ggsurvplot(), but it does not handle a survfit object having a non binary event) :
# Simulating survival data
n <- 1000 # Number of individuals
dataSurvCurve <- data.frame(
id = 1:n,
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time between 1 and 365 days
event = factor(c(sample(0:2, n/2, prob = c(0.4, 0.4, 0.2), replace = TRUE),
sample(0:2, n/2, prob = c(0.2, 0.3, 0.5), replace = TRUE))), # Event (0 = censored, 1 = sick, 2 = dead)
exposure = rep(c("Unexposed", "Exposed"), each = n/2) # Exposure status (Unexposed or Exposed)
)
dataSurvCurve$time[dataSurvCurve$event == 0] <- 365 # Set time to 365 for censored observations (event == 0)
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = dataSurvCurve) # Fitting a Survival Model
# Plot a survival curve with `autoplot` and enhance it with additional layers
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) +
# Add step lines representing survival curves
geom_step(
aes(linetype = strata, color = event), # Map line type to `strata` and color to `event`
size = 0.8 # Set line thickness
) +
# Customize the line types for survival curves
scale_linetype_manual(
name = "Group", # Title for the legend
values = c('dashed', 'solid'), # Define line types: dashed for unexposed, solid for exposed
labels = c('Unexposed', 'Exposed') # Labels for the legend
) +
# Customize the colors for the event states
scale_color_manual(
name = "State", # Title for the legend
values = c("#AAC0AF", "#1C4073", "#B28B84"), # Colors for Healthy, Sick, and Dead states
labels = c('Healthy', 'Sick', 'Dead') # Labels for the legend
) +
# Add axis labels and a title
labs(
x = "Followup period (days)", # X-axis label
y = "Proportion of the population", # Y-axis label
title = "" # Title (empty for now)
) +
# Add a vertical reference line at x = 0
geom_vline(
aes(xintercept = 0), # Position of the line
size = 1 # Thickness of the line
) +
# Customize the y-axis scale
scale_y_continuous(
limits = c(0, 1), # Set the y-axis range from 0 to 1
breaks = seq(0, 1, 0.1), # Major ticks every 0.1
labels = paste0(seq(0, 100, 10), "%"), # Convert tick labels to percentages
expand = c(0, 0) # Remove padding on the y-axis
) +
# Customize the x-axis scale
scale_x_continuous(
breaks = seq(0, 360, 60), # Major ticks every 60 days
expand = c(0, 0) # Remove padding on the x-axis
) +
# Apply a minimal theme for a clean appearance
theme_minimal() +
# Customize specific theme elements
theme(
plot.title = element_text(hjust = 0.5), # Center-align the plot title
axis.title.x = element_text(
face = "bold", # Make x-axis title bold
colour = "black", # Set title color to black
size = 12 # Set font size
),
axis.title.y = element_text(
face = "bold", # Make y-axis title bold
colour = "black", # Set title color to black
size = 12 # Set font size
),
legend.title = element_text(face = "bold", size = 10), # Style legend titles
panel.grid.major.y = element_line(
colour = "lightgray", # Light gray major grid lines on the y-axis
size = 0.3 # Set line thickness
),
panel.grid.minor.x = element_blank() # Remove minor grid lines on the x-axis
)
Lastly, combining different curves using ggsurvplot() :
# Create a data frame for survival analysis
dataSurvCurve <- data.frame(
os.time = colon$time, # Overall survival time from 'colon' dataset
os.status = colon$status, # Overall survival event status (1 = event, 0 = censored)
pfs.time = sample(colon$time), # Progression-free survival time, randomized for illustration
pfs.status = colon$status, # Progression-free survival event status (1 = event, 0 = censored)
sex = colon$sex, # Patient sex
rx = colon$rx, # Treatment type
adhere = colon$adhere # Adherence to treatment
)
# Fit survival curves for progression-free survival (PFS) by treatment type
fit.pfs <- survfit(Surv(pfs.time, pfs.status) ~ rx, data = dataSurvCurve)
# Fit survival curves for overall survival (OS) by treatment type
fit.os <- survfit(Surv(os.time, os.status) ~ rx, data = dataSurvCurve)
# Combine the survival fits into a list for plotting
fits <- list(PFS = fit.pfs, OS = fit.os)
# Plot the survival curves using ggsurvplot
ggsurvplot(
fits, # List of survival fits (PFS and OS)
data = dataSurvCurve, # Data used for plotting
combine = TRUE, # Combine PFS and OS plots into one
censor = FALSE, # Do not display censoring marks
palette = "jco", # Use JCO color palette
surv.scale = "percent", # Display survival probabilities as percentages
ylim = c(0, 1), # Set y-axis limits (0% to 100% survival)
axes.offset = FALSE, # Align axes at the origin
legend = "right", # Position the legend on the right
xlab = "Survival time (years)",# Label for x-axis
xscale = "d_y", # Scale x-axis from days to years
break.x.by = 365.25, # Breaks on the x-axis every year
legend.title = "Treatment type" # Title for the legend
)
Or display the different cruves in a grid :
fit <- survfit( Surv(time, status) ~ sex, data = colon)
ggsurvplot_facet(fit, colon, facet.by = "rx",
palette = "jco", # Combine PFS and OS plots into one
censor = FALSE, # Use JCO color palette
surv.scale = "percent", # Display survival probabilities as percentages
ylim = c(0.3, 1), # Set y-axis limits (0% to 100% survival)
axes.offset = FALSE, # Align axes at the origin
legend = c("PFS", "OS"), # Position the legend on the right
xlab = "Survival time (years)",# Label for x-axis
xscale = "d_y", # Scale x-axis from days to years
break.x.by = 365.25, # Breaks on the x-axis every year
legend.title = "Treatment type", # Title for the legend)
conf.int = TRUE
)
Forest plots are the go to graph for any type of regression that present results as exponential of it’s coefficients. I really like the existing packages displaying the OR and CI95% so I tried to reproduce it.
# Create the data frame for the forest plot
dataForest <- data.frame(
var = c("Age", "Age", "Age", "Age", "Sex", "Sex", "Diabetes", "Diabetes", "Smoker", "Smoker"), # Variables
mod = c("18-24", "25-44", "45-64", "64+", "Male", "Female", "No", "Yes", "No", "Yes"), # Categories or modalities within variables
estimate = c(NA, 1.05, 1.38, 1.72, NA, 0.87, NA, 2.48, NA, 0.97), # Point estimates (Odds Ratios)
conf.low = c(NA, 0.92, 1.21, 1.44, NA, 0.72, NA, 2.13, NA, 0.88), # Lower bounds of confidence intervals
conf.high = c(NA, 1.22, 1.63, 1.87, NA, 1.01, NA, 2.67, NA, 1.05) # Upper bounds of confidence intervals
) %>%
mutate(
id = 1:10, # Assign a unique ID to each row
label = ifelse(is.na(estimate), "Ref",
paste0(formatC(estimate, 2, format = "f"), " [",
formatC(conf.low, 2, format = "f"), "-",
formatC(conf.high, 2, format = "f"), "]")),
# Create labels for display: "estimate [conf.low-conf.high]" or "Ref" if missing
estimate = ifelse(is.na(estimate), 1, estimate), # Replace missing estimates with 1 (reference)
conf.low = ifelse(is.na(conf.low), 1, conf.low), # Replace missing lower bounds with 1 (reference)
conf.high = ifelse(is.na(conf.high), 1, conf.high), # Replace missing upper bounds with 1 (reference)
signif = ifelse(conf.low <= 1 & conf.high >= 1, "p>0.05", "p<0.05")
# Determine significance: overlaps 1 (p>0.05) or not (p<0.05)
)
# Create the forest plot using ggplot2
ggplot(dataForest) +
geom_vline(
xintercept = 1, # Reference line at Odds Ratio = 1
color = "gray25",
linetype = 2 # Dashed line
) +
geom_vline(
xintercept = c(0.25, 0.5, 2, 4), # Additional vertical lines for log-scale guidance
color = "gray75",
linetype = "dotted"
) +
geom_errorbar(
aes(
y = reorder(mod, -id), # Reorder y-axis labels based on ID
xmin = conf.low, # Lower bound of the confidence interval
xmax = conf.high # Upper bound of the confidence interval
),
position = position_dodge(0.5), # Adjust error bar position
width = 0.4, # Width of the error bars
size = 0.8 # Thickness of the error bars
) +
geom_point(
aes(x = estimate, y = reorder(mod, -id), color = signif), # Add points for the estimates
position = position_dodge(width = 0.5), # Adjust point position
size = 2.5 # Size of the points
) +
scale_x_log10( # Use a logarithmic scale for x-axis
breaks = c(0.5, 1, 2, 4), # Specify tick marks
limits = c(0.49, 6) # Set x-axis limits
) +
scale_color_manual(
name = "Significance", # Legend title for significance
values = c("red", "black") # Colors for significant and non-significant points
) +
facet_grid(
rows = vars(var), # Create separate panels for each variable
scales = "free", # Allow scales to adjust freely per panel
switch = "both" # Switch facet labels to both sides
) +
xlab("Odds ratio") + # Label for the x-axis
ylab("") + # Remove y-axis label
theme_classic() + # Use a clean, minimal theme
theme(panel.grid.major.y = element_line(colour = "lightgray")) + # Add gridlines for readability
geom_text(
aes(x = 5, y = reorder(mod, -id), label = dataForest$label), # Add text labels for estimates
)